NORTHERN RANGE DATASET

Preliminaries

library(PLNmodels)
This is package 'PLNmodels' version 1.2.2
Use future::plan(multicore/multisession) to speed up PLNPCA/PLNmixture/stability_selection.
library(Metrics)
library(pheatmap)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(FactoMineR)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
corrplot 0.95 loaded
library(aricode)
library(viridisLite)

Northern Range dataset

load("NORTHERN_RANGE_DATASET_COUNTS.Rdata")
NR_COUNTS <- NORTHERN_RANGE_DATASET_COUNTS %>% as_tibble()
NR_COUNTS <- NR_COUNTS[, -21]

load("NORTHERN_RANGE_DATASET_COVARIATES.Rdata")
NR_COVARIATES <- 
  NORTHERN_RANGE_DATASET_COVARIATES %>% as_tibble() %>% 
  rename(latitude = LATITUDE, 
         longitude = LONGITUDE, 
         long_lat = LONGLAT,
         coarse_gravel = coarse.gravel, 
         fine_gravel = fine.gravel, 
         leaf_litter = leaf.litter,
         time_step = TimeStep, year = YEAR, month = MONTH, 
         stream = STREAM, 
         disturbance = DISTURBANCE, # human activity / binary categorical variable
         altitude = ALTITUDE, 
         season = SEASON) %>% 
  mutate(
    season = as.character(season),
    year = as.character(year),
    disturbance = ifelse(disturbance == 1, "yes", "no")
  ) %>% 
  ## flow is not reliable
  ## time_step + month are redundant with season + year
  ## site is redudant with disturbance + stream
  ## long_lat is the 1st PCA axis of latitude + longitude
  dplyr::select(-long_lat, -site, -time_step, -month, -flow) %>%
  relocate(latitude, longitude, altitude, width, depth, volume, garbage,
           conductivity, O2, pH, temperature, turbidity, # site features
          coarse_gravel, fine_gravel, leaf_litter, cobble, sand, silt, boulders, canopy, # soil
          season, year, # sampling time
          stream, disturbance # categorical: place + human activity
         )
# turbidity: ordinal variable indicating the level of turbidity
# season:-> dry-start/dry-end, wet-start/wet-end, (january-> may: dry)
quali_ind <- seq(ncol(NR_COVARIATES) - 3, ncol(NR_COVARIATES))

PCA on Covariates

nb_pc <- ncol(NR_COVARIATES) - 4
NR_PCA <-
  NR_COVARIATES %>%
  PCA(quali.sup = quali_ind, ncp = nb_pc, scale.unit = TRUE, graph = FALSE)
## no effect of season and year
## strong effect of stream and disturbance
NR_PCA %>% fviz_pca_biplot(habillage = quali_ind) + scale_color_viridis_d()
Warning: `gather_()` was deprecated in tidyr 1.2.0.
ℹ Please use `gather()` instead.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.

NR_PCA %>% fviz_eig()

NR_PCA_SCORES <- setNames(data.frame(NR_PCA$ind$coord), paste0("PC",1:nb_pc))
NR_DATA <- prepare_data(
  counts     = NR_COUNTS,
  covariates = cbind(NR_COVARIATES, NR_PCA_SCORES)
  )

PLN: multivariate regression

NR_PLN           <- PLN(Abundance ~ 1, data = NR_DATA)

 Initialization...
 Adjusting a full covariance PLN model with nlopt optimizer
 Post-treatments...
 DONE!
NR_PLN_Stream    <- PLN(Abundance ~ 0 + stream, data = NR_DATA)

 Initialization...
 Adjusting a full covariance PLN model with nlopt optimizer
 Post-treatments...
 DONE!
NR_PLN_Stream_PC <- PLN(Abundance ~ 0 + stream + PC1, data = NR_DATA)

 Initialization...
 Adjusting a full covariance PLN model with nlopt optimizer
 Post-treatments...
 DONE!
criteria <- rbind(NR_PLN$criteria, NR_PLN_Stream$criteria, NR_PLN_Stream_PC$criteria) %>% add_column(model = c("PLN ~ 1", "PLN ~ stream",  "PLN ~ stream + PC1"), .before = 1) 
B_pln <- t(coefficients(NR_PLN_Stream_PC))
colnames(B_pln) <- c(sort(unique(NR_DATA$stream)), "PC1")
mu_pln <- predict(NR_PLN_Stream_PC, NR_DATA)

pheatmap(B_pln)

pheatmap(mu_pln, show_rownames = FALSE)

counts  <- NR_DATA$Abundance  %>% as("matrix")
fit <- as.data.frame(NR_PLN_Stream_PC$fitted) %>% 
  setNames(colnames(NR_DATA$Abundance))
plot_counts <- counts %>% as_tibble(rownames = "sample") %>% 
  pivot_longer(-sample, names_to = "species", values_to = "count")
plot_fitted_pln <- fit %>% as_tibble(rownames = "sample") %>% 
  pivot_longer(-sample, names_to = "species", values_to = "Y_hat") 

dplot <-
  inner_join(plot_counts, plot_fitted_pln, by = join_by(sample, species)) %>%  
  inner_join(NR_COVARIATES %>%  as_tibble(rownames = "sample"), by = "sample")

p <- dplot %>% 
  ggplot(aes(x = count, y = Y_hat)) + 
  geom_point(alpha = 0.25) +
  geom_abline(slope = 1, intercept = 0, color = "grey60", linetype = "dashed") + 
  scale_x_log10() + scale_y_log10() + 
  labs(x = "Observed", y = "Fitted") + 
  facet_grid(disturbance~season) + 
  theme_bw()
p
Warning in scale_x_log10(): log-10 transformation introduced infinite values.

PLNPCA: dimension reduction and vizualisation

NR_PLNPCA_all <- PLNPCA(Abundance ~ PC1, ranks = 1:12, data = NR_DATA)

 Initialization...

 Adjusting 12 PLN models for PCA analysis.
     Rank approximation = 9 
     Rank approximation = 10 
     Rank approximation = 8 
     Rank approximation = 11 
     Rank approximation = 7 
     Rank approximation = 4 
     Rank approximation = 1 
     Rank approximation = 6 
     Rank approximation = 2 
     Rank approximation = 5 
     Rank approximation = 3 
     Rank approximation = 12 
 Post-treatments
 DONE!
plot(NR_PLNPCA_all)

NR_PLNPCA <- NR_PLNPCA_all %>% getBestModel("ICL")
NR_PLNPCA %>% fviz_pca_biplot(col.ind = NR_DATA$stream) + scale_color_viridis_d()
Warning: The shape palette can deal with a maximum of 6 discrete values because more
than 6 becomes difficult to discriminate
ℹ you have requested 8 values. Consider specifying shapes manually if you need
  that many of them.
Warning: Removed 76 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

NR_PLNPCA %>% fviz_pca_biplot(col.ind = NR_DATA$temperature) + scale_color_viridis_c()

NR_PLNPCA_Stream_all <- PLNPCA(Abundance ~ 0 + stream + PC1 , ranks = 1:10, data = NR_DATA)

 Initialization...

 Adjusting 10 PLN models for PCA analysis.
     Rank approximation = 10 
     Rank approximation = 8 
     Rank approximation = 2 
     Rank approximation = 6 
     Rank approximation = 4 
     Rank approximation = 1 
     Rank approximation = 5 
     Rank approximation = 9 
     Rank approximation = 7 
     Rank approximation = 3 
 Post-treatments
 DONE!
plot(NR_PLNPCA_Stream_all)

NR_PLNPCA_Stream <- NR_PLNPCA_Stream_all %>% getBestModel("ICL") 
NR_PLNPCA_Stream %>% fviz_pca_biplot(col.ind = NR_DATA$stream) + scale_color_viridis_d()
Warning: The shape palette can deal with a maximum of 6 discrete values because more
than 6 becomes difficult to discriminate
ℹ you have requested 8 values. Consider specifying shapes manually if you need
  that many of them.
Warning: Removed 76 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

B_pca <- t(coefficients(NR_PLNPCA_Stream))
colnames(B_pca) <- c(sort(unique(NR_DATA$stream)), "PC1")
mu_pca <- predict(NR_PLNPCA_Stream, NR_DATA)

pheatmap(B_pca)

pheatmap(mu_pca, show_rownames = FALSE)

corrplot(cov2cor(sigma(NR_PLN_Stream_PC)), is.corr = FALSE, order="hclust", method = "color")

corrplot(cov2cor(sigma(NR_PLNPCA)), is.corr = FALSE, order="hclust", method = "color")

corrplot(cov2cor(sigma(NR_PLNPCA_Stream)), is.corr = FALSE, order="hclust", method = "color")

fit_pca <- as.data.frame(NR_PLNPCA_Stream$fitted) %>% 
  setNames(colnames(NR_DATA$Abundance))
plot_fitted_pca <- fit_pca %>% as_tibble(rownames = "sample") %>% 
  pivot_longer(-sample, names_to = "species", values_to = "Y_hat") 

dplot <- 
  inner_join(plot_counts, plot_fitted_pca, by = join_by(sample, species)) %>%  
  inner_join(NR_COVARIATES %>%  as_tibble(rownames = "sample"), by = "sample")

p <- dplot %>% 
  ggplot(aes(x = count, y = Y_hat)) + 
  geom_point(alpha = 0.25) +
  geom_abline(slope = 1, intercept = 0, color = "grey60", linetype = "dashed") + 
  scale_x_log10() + scale_y_log10() + 
  labs(x = "Observed", y = "Fitted") + 
  facet_grid(disturbance~season) + 
  theme_bw()
p
Warning in scale_x_log10(): log-10 transformation introduced infinite values.

criteria <- criteria %>% bind_rows( 
                  data.frame(model = "PLNPCA ~ PC1", NR_PLNPCA$criteria),
                  data.frame(model = "PLNPCA ~ stream + PC1", NR_PLNPCA_Stream$criteria))

PLNLDA: Discriminant analysis + Classifier

NR_PLNLDA <- PLNLDA(Abundance ~ PC1, grouping = stream, data = NR_DATA)

 Performing discriminant Analysis...
 DONE!
plot(NR_PLNLDA, map = "individual") + scale_color_viridis_d()

plot(NR_PLNLDA, map = "variable") + scale_color_viridis_d()

group_means <- NR_PLNLDA$group_means
colnames(group_means) <- sort(unique(NR_DATA$stream))
rownames(group_means) <- colnames(NR_DATA$Abundance)
pheatmap(group_means)

set.seed(1969)
train_set <- sample.int(nrow(NR_DATA), 200)
test_set  <- setdiff(1:nrow(NR_DATA), train_set)
NR_train <- NR_DATA[train_set, ]
NR_test   <- NR_DATA[test_set, ]

classifier <- PLNLDA(Abundance ~ 1 + PC1 + PC2 + PC3 + PC4, grouping = stream, data = NR_train) 

 Performing discriminant Analysis...
 DONE!
pred_stream <- predict(classifier, newdata = NR_test, type = "response")
aricode::ARI(pred_stream, NR_test$stream)
[1] 0.8369625

Mixture models

NR_PLNMM_all <- PLNmixture(Abundance ~ 1 + PC1, clusters = 1:8, data = NR_DATA)

 Initialization...

 Adjusting 8 PLN mixture models.
    number of cluster = 1 
    number of cluster = 2 
    number of cluster = 3 
    number of cluster = 4 
    number of cluster = 5 
    number of cluster = 6 
    number of cluster = 7 
    number of cluster = 8 

 Smoothing PLN mixture models.
   Going backward +++++++
                                                                                                    
   Going forward +++++++
                                                                                                    
 Post-treatments
 DONE!
NR_PLNMM <- NR_PLNMM_all %>% getBestModel("BIC")
NR_PLNMM$plot_clustering_pca()

NR_PLNPCA %>% fviz_pca_biplot(col.ind = factor(NR_PLNMM$memberships)) + scale_colour_viridis_d()

NR_PLNPCA %>% fviz_pca_biplot(col.ind = factor(NR_DATA$stream)) + scale_colour_viridis_d()

ARI(NR_PLNMM$memberships, NR_DATA$stream)
[1] 0.3431925

Association Network

NR_PLNNET_all <- PLNnetwork(Abundance ~ PC1, data = NR_DATA, control = PLNnetwork_param(penalize_diagonal = FALSE))

 Initialization...
 Adjusting 30 PLN with sparse inverse covariance estimation
    Joint optimization alternating gradient descent and graphical-lasso
    sparsifying penalty = 7.000889 
    sparsifying penalty = 6.466517 
    sparsifying penalty = 5.972934 
    sparsifying penalty = 5.517025 
    sparsifying penalty = 5.095915 
    sparsifying penalty = 4.706948 
    sparsifying penalty = 4.347671 
    sparsifying penalty = 4.015817 
    sparsifying penalty = 3.709293 
    sparsifying penalty = 3.426166 
    sparsifying penalty = 3.16465 
    sparsifying penalty = 2.923095 
    sparsifying penalty = 2.699977 
    sparsifying penalty = 2.49389 
    sparsifying penalty = 2.303534 
    sparsifying penalty = 2.127707 
    sparsifying penalty = 1.965301 
    sparsifying penalty = 1.815291 
    sparsifying penalty = 1.676732 
    sparsifying penalty = 1.548748 
    sparsifying penalty = 1.430534 
    sparsifying penalty = 1.321342 
    sparsifying penalty = 1.220485 
    sparsifying penalty = 1.127327 
    sparsifying penalty = 1.041279 
    sparsifying penalty = 0.9617988 
    sparsifying penalty = 0.8883856 
    sparsifying penalty = 0.8205758 
    sparsifying penalty = 0.757942 
    sparsifying penalty = 0.7000889 
 Post-treatments
 DONE!
NR_PLNNET_all$stability_selection()

Stability Selection for PLNnetwork: 
subsampling: ++++++++++++++++++++
plot(NR_PLNNET_all, "stability", stability = 0.95)

NR_PLNNET <- NR_PLNNET_all$getBestModel("StARS")
NR_PLNNET$plot_network()

NR_PLNNET_Stream_all <- PLNnetwork(Abundance ~ PC1 + stream, data = NR_DATA, control = PLNnetwork_param(penalize_diagonal = FALSE))

 Initialization...
 Adjusting 30 PLN with sparse inverse covariance estimation
    Joint optimization alternating gradient descent and graphical-lasso
    sparsifying penalty = 0.7024127 
    sparsifying penalty = 0.6487981 
    sparsifying penalty = 0.5992759 
    sparsifying penalty = 0.5535337 
    sparsifying penalty = 0.511283 
    sparsifying penalty = 0.4722572 
    sparsifying penalty = 0.4362102 
    sparsifying penalty = 0.4029146 
    sparsifying penalty = 0.3721605 
    sparsifying penalty = 0.3437538 
    sparsifying penalty = 0.3175154 
    sparsifying penalty = 0.2932797 
    sparsifying penalty = 0.2708939 
    sparsifying penalty = 0.2502168 
    sparsifying penalty = 0.231118 
    sparsifying penalty = 0.2134769 
    sparsifying penalty = 0.1971824 
    sparsifying penalty = 0.1821317 
    sparsifying penalty = 0.1682297 
    sparsifying penalty = 0.1553889 
    sparsifying penalty = 0.1435282 
    sparsifying penalty = 0.1325728 
    sparsifying penalty = 0.1224536 
    sparsifying penalty = 0.1131068 
    sparsifying penalty = 0.1044735 
    sparsifying penalty = 0.09649913 
    sparsifying penalty = 0.08913343 
    sparsifying penalty = 0.08232995 
    sparsifying penalty = 0.07604578 
    sparsifying penalty = 0.07024127 
 Post-treatments
 DONE!
NR_PLNNET_Stream_all$stability_selection()

Stability Selection for PLNnetwork: 
subsampling: ++++++++++++++++++++
plot(NR_PLNNET_Stream_all, "stability", stability = 0.95)

NR_PLNNET_Stream <- NR_PLNNET_Stream_all$getBestModel("StARS")
NR_PLNNET_Stream$plot_network()

criteria <- criteria %>% bind_rows( 
                  data.frame(model = "PLNNET ~ PC1", NR_PLNNET$criteria[, 1:4]),
                  data.frame(model = "PLNNET ~ stream + PC1", NR_PLNNET_Stream$criteria[, 1:4]))

Zero Inflation in PLN

NR_ZIPLN  <- ZIPLN(Abundance ~ PC1 | 0 + stream,  data = NR_DATA)

 Initialization...
 Adjusting a ZI-PLN model with full covariance model and covar specific parameter(s) in Zero inflation component.
 DONE!
NR_ZIPLNNET_all <- ZIPLNnetwork(Abundance ~ PC1 | 0 + stream,  data = NR_DATA, control = ZIPLNnetwork_param(penalize_diagonal = FALSE))

 Initialization...
 Adjusting 30 ZI-PLN with sparse inverse covariance estimation and covar specific parameter(s) in Zero inflation component.
    sparsifying penalty = 0.7553838 
    sparsifying penalty = 0.697726 
    sparsifying penalty = 0.6444692 
    sparsifying penalty = 0.5952774 
    sparsifying penalty = 0.5498404 
    sparsifying penalty = 0.5078715 
    sparsifying penalty = 0.4691061 
    sparsifying penalty = 0.4332997 
    sparsifying penalty = 0.4002263 
    sparsifying penalty = 0.3696773 
    sparsifying penalty = 0.3414602 
    sparsifying penalty = 0.3153968 
    sparsifying penalty = 0.2913229 
    sparsifying penalty = 0.2690864 
    sparsifying penalty = 0.2485473 
    sparsifying penalty = 0.2295759 
    sparsifying penalty = 0.2120526 
    sparsifying penalty = 0.1958668 
    sparsifying penalty = 0.1809164 
    sparsifying penalty = 0.1671072 
    sparsifying penalty = 0.1543521 
    sparsifying penalty = 0.1425705 
    sparsifying penalty = 0.1316882 
    sparsifying penalty = 0.1216366 
    sparsifying penalty = 0.1123522 
    sparsifying penalty = 0.1037764 
    sparsifying penalty = 0.09585526 
    sparsifying penalty = 0.08853871 
    sparsifying penalty = 0.08178063 
    sparsifying penalty = 0.07553838 
 DONE!
NR_ZIPLNNET <- NR_ZIPLNNET_all$getBestModel("BIC")
NR_ZIPLNNET$plot_network()

criteria <- criteria %>% bind_rows( 
                  data.frame(model = "ZIPLN ~ PC1 | stream", NR_ZIPLN$criteria),
                  data.frame(model = "ZIPLNNET ~ PC1 | stream", NR_ZIPLNNET$criteria[, 1:4]))
criteria %>% gt::gt()
model nb_param loglik BIC ICL
PLN ~ 1 230 -9393.133 -10050.592 -17212.929
PLN ~ stream 370 -8292.598 -9350.248 -14740.558
PLN ~ stream + PC1 390 -8206.340 -9321.161 -14573.897
PLNPCA ~ PC1 159 -6565.395 -7019.898 -6699.009
PLNPCA ~ stream + PC1 270 -5881.358 -6653.157 -6562.650
PLNNET ~ PC1 77 -9619.069 -9839.174 -16524.285
PLNNET ~ stream + PC1 211 -8312.991 -8916.138 -14484.045
ZIPLN ~ PC1 | stream 290 -8548.828 -9377.797 -14456.248
ZIPLNNET ~ PC1 | stream 117 -8587.148 -8921.594 -14322.612
fit_ZIPLN <- as.data.frame(NR_ZIPLN$fitted) %>% 
  setNames(colnames(NR_DATA$Abundance))
latent_ZIPLN <- as.data.frame(NR_ZIPLN$latent) %>% 
  setNames(colnames(NR_DATA$Abundance))
latent_pos_ZIPLN <- as.data.frame(NR_ZIPLN$latent_pos) %>% 
  setNames(colnames(NR_DATA$Abundance))
B_ZIPLN <- as.data.frame(NR_ZIPLN$model_par$B) %>% 
  setNames(colnames(NR_DATA$Abundance))
B0_ZIPLN <- as.data.frame(NR_ZIPLN$model_par$B0) %>% 
  setNames(colnames(NR_DATA$Abundance))
prob_zi <- as.data.frame(NR_ZIPLN$var_par$R) %>% 
  setNames(colnames(NR_DATA$Abundance))

Parameters analysis

site <- as.data.frame(NR_COVARIATES[, c("disturbance", "season")])
pheatmap(prob_zi, annotation_row = site)

pheatmap(latent_ZIPLN)

pheatmap(latent_pos_ZIPLN)

pheatmap(B_pln)

pheatmap(B_pca)

pheatmap(B0_ZIPLN)

prcomp(latent_ZIPLN, scale. = TRUE) %>% fviz_pca_biplot(col.ind = NR_COVARIATES$stream)
Warning: The shape palette can deal with a maximum of 6 discrete values because more
than 6 becomes difficult to discriminate
ℹ you have requested 8 values. Consider specifying shapes manually if you need
  that many of them.
Warning: Removed 76 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).